/*LIS Cross-section Data center in Luxembourg*/

/*email: usersupport@lisdatacenter.org*/

/*LIS Self Teaching Package 2022*/
/*Part I: Inequality, poverty, and social policy*/
/*SAS version*/

/*last change of this version of the syntax: 15-01-2022*/

/*Exercise 3: Working with household income variables (top and bottom coding and equivalence scales)*/

/* Session setup: do not stop on missing formats, keep notes and source
   echo out of the log, drop date/page number from the listing, left-align
   output, show variable labels, and use the maximum line and page size. */
OPTIONS
    NOFMTERR NONOTES NOSOURCE
    NODATE NONUMBER NOCENTER
    LABEL LS=MAX PS=MAX
;
/* Set the first title line to an empty string. */
TITLE "";
/* Build the working data set "current" from the data set referenced by
   &gt06h, keeping only records in which every listed income component
   is present, and add a person-level weight. */
DATA current;
    SET &gt06h (KEEP = dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc
                       hpopwgt nhhmem grossnet did);

    /* 1 when any of the six income components equals missing (.), else 0 */
    miss_comp = (dhi = . OR hpub_i = . OR hpub_a = . OR hpub_u = .
                 OR hiprivate = . OR hxitsc = .);
    IF miss_comp THEN DELETE;

    /* household weight multiplied by the number of household members */
    ipwgt = hpopwgt * nhhmem;
RUN;

TITLE "Top/bottom-coding" ;
/* Prepare DHI for outlier detection: copy it to dhitb, recode negative
   values to zero, and derive the log income (dhilog) that is later used to
   locate the quartiles. */
DATA current ;
    SET current ;
    dhitb = dhi ;

    /* Bottom and top coding / outlier detection */

    /* Select only records where DHI is non-missing */
    IF dhitb = . THEN DELETE ;

    /* Recode negative DHI into zero.
       Block comments are used on purpose: a "*" comment that lacks its
       terminating semicolon absorbs the statement that follows it, which
       would silently disable this recode. */
    IF (dhi < 0) THEN dhitb = 0 ;

    /* LOG is undefined at zero; those records get dhilog = 0 so that they
       stay in the overall distribution of non-missing DHI. Guarding the
       call also avoids invalid-argument messages from LOG. */
    IF dhitb > 0 THEN dhilog = LOG(dhitb) ;
    ELSE dhilog = 0 ;
RUN ;
 /* Interquartile-range detection, step 1: order the records by did and,
    within did, by log income (dhilog).
    NOTE(review): the sort keys are did and dhilog, not equivalized income;
    confirm whether the percentile step actually needs sorted input. */
PROC SORT DATA=current OUT=current;
    BY did dhilog;
RUN;
/* Weighted 25th and 75th percentiles of dhilog (weight: hpopwgt), written
   to the data set temp as q25 and q75; printed output is suppressed. */
PROC UNIVARIATE DATA=current NOPRINT;
    WEIGHT hpopwgt;
    VAR dhilog;
    OUTPUT OUT=temp
           P25=q25
           P75=q75;
RUN;
/* Publish the quartiles stored in temp as macro variables:
     b = q25 (lower quartile of log income)
     t = q75 (upper quartile of log income)
   The values are converted with an explicit BEST32. format. Passing a
   numeric value directly to CALL SYMPUT triggers an automatic
   numeric-to-character conversion with BEST12., which can drop significant
   digits. BEST32. output is right-aligned, so the leading blanks keep
   expressions such as &t-&b valid even when a quartile is negative. */
DATA _NULL_;
    SET temp;
    CALL SYMPUT("b", PUT(q25, BEST32.));
    CALL SYMPUT("t", PUT(q75, BEST32.));
RUN;
/* Top- and bottom-code dhitb at fences placed three interquartile ranges
   of log income beyond the quartiles held in &b (q25) and &t (q75). */
DATA current;
    SET current;

    /* interquartile range on the log scale */
    iqr = (&t) - (&b);

    /* fences for extreme values, still on the log scale */
    upper_bound = (&t) + 3 * iqr;
    lower_bound = (&b) - 3 * iqr;

    /* EXP converts each fence back to income units: values above the
       upper fence are top-coded, values below the lower fence are
       bottom-coded */
    IF dhitb > EXP(upper_bound) THEN dhitb = EXP(upper_bound);
    IF dhitb < EXP(lower_bound) THEN dhitb = EXP(lower_bound);
RUN;

/* Weighted (hpopwgt) mean, median, minimum and maximum of dhi next to its
   top/bottom-coded copy dhitb.
   NOTE(review): the title mentions per-capita and equivalized income, but
   the variables summarized here are dhi and dhitb - confirm the wording. */
TITLE "Income per Capita and Equivalized Income before top/bottom-coding" ;
PROC MEANS DATA=current MEAN MEDIAN MIN MAX;
    WEIGHT hpopwgt;
    VAR dhi dhitb;
RUN;

TITLE "Income per Capita and Equivalized Income after top/bottom-coding" ;

/* Derive two size-adjusted incomes from the top/bottom-coded dhitb:
     edhi  - dhitb divided by the square root of household size
     dhipc - dhitb divided by household size */
DATA current;
    SET current;
    edhi  = dhitb / SQRT(nhhmem);
    dhipc = dhitb / nhhmem;
RUN;

/* Summarize both measures using the person-level weight ipwgt. */
PROC MEANS DATA=current MEAN MEDIAN MIN MAX;
    WEIGHT ipwgt;
    VAR dhipc edhi;
RUN;

